# Load the Titanic training data.
# stringsAsFactors = TRUE keeps text columns as factors on R >= 4.0 (where
# read.csv() otherwise returns character columns), matching the
# "Factor w/ ..." structure shown in the str() output of this walkthrough.
titanic <- read.csv("Titanic_train.csv", stringsAsFactors = TRUE)
colnames(titanic)
## [1] "PassengerId" "Survived" "Pclass" "Name" "Sex"
## [6] "Age" "SibSp" "Parch" "Ticket" "Fare"
## [11] "Cabin" "Embarked"
# Rename the "Sex" column to "Gender". Matching on the name instead of the
# hard-coded position 5 means a reordered file cannot rename the wrong column.
colnames(titanic)[colnames(titanic) == "Sex"] <- "Gender"
# Preview the first five columns of the first four passengers.
head(titanic[, 1:5], 4)
## PassengerId Survived Pclass
## 1 1 0 3
## 2 2 1 1
## 3 3 1 3
## 4 4 1 1
## Name Gender
## 1 Braund, Mr. Owen Harris male
## 2 Cumings, Mrs. John Bradley (Florence Briggs Thayer) female
## 3 Heikkinen, Miss. Laina female
## 4 Futrelle, Mrs. Jacques Heath (Lily May Peel) female
# What would be some good features to consider?
# Preview the remaining columns (6 through the last) of the first four rows.
head(titanic[, 6:ncol(titanic)], 4)
## Age SibSp Parch Ticket Fare Cabin Embarked
## 1 22 1 0 A/5 21171 7.2500 S
## 2 38 1 0 PC 17599 71.2833 C85 C
## 3 26 0 0 STON/O2. 3101282 7.9250 S
## 4 35 1 0 113803 53.1000 C123 S
# Show the type and first values of every column. The listing below is the
# output of this call, which was missing from the script.
str(titanic)
## 'data.frame': 891 obs. of 12 variables:
## $ PassengerId: int 1 2 3 4 5 6 7 8 9 10 ...
## $ Survived : int 0 1 1 1 0 0 0 0 1 1 ...
## $ Pclass : int 3 1 3 1 3 3 1 3 3 2 ...
## $ Name : Factor w/ 891 levels "Abbing, Mr. Anthony",..: 109 191 358 277 16 559 520 629 416 581 ...
## $ Gender : Factor w/ 2 levels "female","male": 2 1 1 1 2 2 2 2 1 1 ...
## $ Age : num 22 38 26 35 35 NA 54 2 27 14 ...
## $ SibSp : int 1 1 0 1 0 0 0 3 0 1 ...
## $ Parch : int 0 0 0 0 0 0 0 1 2 0 ...
## $ Ticket : Factor w/ 681 levels "110152","110413",..: 525 596 662 50 473 276 86 396 345 133 ...
## $ Fare : num 7.25 71.28 7.92 53.1 8.05 ...
## $ Cabin : Factor w/ 148 levels "","A10","A14",..: 1 83 1 57 1 1 131 1 1 1 ...
## $ Embarked : Factor w/ 4 levels "","C","Q","S": 4 2 4 4 4 3 4 4 4 2 ...
# Recode Survived (0/1) as a labelled factor. Listing the levels explicitly
# ties each label to its code instead of relying on the sort order of the data.
titanic$Survived <- factor(
  titanic$Survived,
  levels = c(0, 1),
  labels = c("died", "survived")
)
# Recode the embarkation port codes. The empty string marks passengers whose
# port is missing in the file. Explicit levels keep the mapping correct (and
# avoid a label/level count mismatch) even if a code is absent from the data.
# NOTE: the first label was previously misspelled "Unkown".
titanic$Embarked <- factor(
  titanic$Embarked,
  levels = c("", "C", "Q", "S"),
  labels = c("Unknown", "Cherbourg", "Queenstown", "Southampton")
)
str(titanic$Survived)
## Factor w/ 2 levels "died","survived": 1 2 2 2 1 1 1 1 2 2 ...
str(titanic$Gender)
## Factor w/ 2 levels "female","male": 2 1 1 1 2 2 2 2 1 1 ...
# Count passengers by outcome and print the table.
survivedTable <- table(titanic$Survived)
survivedTable
##
## died survived
## 549 342
# Overall survival pie chart, drawn without margins.
# par() returns the previous settings; restore them afterwards so the margin
# and layout changes do not leak into plots drawn later in the script.
old_par <- par(mar = c(0, 0, 0, 0), oma = c(0, 0, 0, 0))
pie(survivedTable, labels = c("Died", "Survived"))
par(old_par)

# Survival by gender: two pies side by side, with room for a title on top.
old_par <- par(mfrow = c(1, 2), mar = c(0, 0, 2, 0), oma = c(0, 1, 0, 1))
pie(
  table(titanic[titanic$Gender == "male", "Survived"]),
  labels = c("Dead", "Survived"),
  main = "Survival Portion of Men"
)
pie(
  table(titanic[titanic$Gender == "female", "Survived"]),
  labels = c("Dead", "Survived"),
  main = "Survival Portion of Women"
)
par(old_par)
# Five-number summary (plus mean and NA count) of all passenger ages.
Age <- titanic$Age
summary(Age)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.42 20.12 28.00 29.70 38.00 80.00 177
# The same summary restricted to passengers who died ...
summary(titanic$Age[titanic$Survived == "died"])
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 1.00 21.00 28.00 30.63 39.00 74.00 125
# ... and to passengers who survived.
summary(titanic$Age[titanic$Survived == "survived"])
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.42 19.00 28.00 28.34 36.00 80.00 52
# Histogram of ages with enlarged labels, axis text and title.
hist(
  Age,
  main = "Distribution of Passenger Ages on Titanic",
  xlab = "Age",
  ylab = "Frequency",
  col = "blue",
  cex.main = 1.6,
  cex.lab = 1.6,
  cex.axis = 1.4
)
# Kernel density estimate of age; missing ages are dropped before estimating.
d <- density(na.omit(Age))
plot(
  d,
  main = "kernel density of Ages of Passengers",
  xlab = "Age",
  cex.lab = 1.6,
  cex.axis = 1.4
)
# Fill the area under the density curve.
polygon(d, border = "blue", col = "red")
# An example of feature engineering: deriving a Child/Adult category from Age.
## Multi dimensional comparison
# Derive an age-group feature from Age in one vectorised step:
#   Age <  13  -> "Child"
#   Age >= 13  -> "Adult"
#   Age is NA  -> NA (ifelse() propagates the missing value)
# This replaces a chain of in-place overwrites that recoded ages to 1/2 and
# then to text, which depended on overwrite order and implicit type coercion.
Child <- ifelse(titanic$Age < 13, "Child", "Adult")
# Append the new column to a copy of the titanic data set.
titanic_with_child_column <- cbind(titanic, Child)
# Drop the rows whose age (and therefore age group) is unknown.
titanic_with_child_column <-
  titanic_with_child_column[!is.na(titanic_with_child_column$Child), ]
## Show part of the data frame with new feature: Child
# Columns are selected by name so the preview does not depend on Child being
# column 13.
head(
  titanic_with_child_column[, c("PassengerId", "Survived", "Pclass", "Child")],
  4
)
## PassengerId Survived Pclass Child
## 1 1 died 3 Adult
## 2 2 survived 1 Adult
## 3 3 survived 3 Adult
## 4 4 survived 1 Adult
# The diamonds data set ships with ggplot2, so the package must be attached
# before data(diamonds) can find it.
library(ggplot2)
data(diamonds) # loading diamonds data set
head(diamonds, 9) # first few rows of diamond data set
## carat cut color clarity depth table price x y z
## 1 0.23 Ideal E SI2 61.5 55 326 3.95 3.98 2.43
## 2 0.21 Premium E SI1 59.8 61 326 3.89 3.84 2.31
## 3 0.23 Good E VS1 56.9 65 327 4.05 4.07 2.31
## 4 0.29 Premium I VS2 62.4 58 334 4.20 4.23 2.63
## 5 0.31 Good J SI2 63.3 58 335 4.34 4.35 2.75
## 6 0.24 Very Good J VVS2 62.8 57 336 3.94 3.96 2.48
## 7 0.24 Very Good I VVS1 62.3 57 336 3.95 3.98 2.47
## 8 0.26 Very Good H SI1 61.9 55 337 4.07 4.11 2.53
## 9 0.22 Fair E VS2 65.1 61 337 3.87 3.78 2.49

# Histogram of carat with bins 0.2 wide.
ggplot(data = diamonds) +
  geom_histogram(aes(x = carat), binwidth = 1 / 5) +
  theme(text = element_text(size = 25))

# Density estimate of carat, filled in gray.
ggplot(data = diamonds) +
  geom_density(aes(x = carat), fill = "gray50") +
  theme(text = element_text(size = 25))

# Scatter plot of price against carat.
ggplot(diamonds, aes(x = carat, y = price)) +
  geom_point() +
  theme(text = element_text(size = 25))

# Save the base layer (data plus carat/price mapping) so that it can be reused.
g <- ggplot(diamonds, aes(x = carat, y = price))
# Same scatter plot, with the points coloured by the diamond's color grade.
g + geom_point(aes(color = color)) + theme(text = element_text(size = 25))
# One panel per color grade.
g + geom_point(aes(color = color)) +
  facet_wrap(~color) +
  theme(text = element_text(size = 20))
# What have you learned about diamond prices after exploring this data?
/
#